*Set up working dataset for WHD case-level data

*******************************************************************************
*Read the public WHD WHISARD extract (CSV downloaded from
*https://enforcedata.dol.gov/views/data_summary.php).
*This file gives back wages agreed to pay (not always the same as back wages
*assessed), CMPs assessed, and the Repeat/Willful violator designation, but
*it does not give liquidated damages or hot goods violations.
import delimited using "$whdpublic/whd_whisard", clear

*Upper-case the case key so it matches the CASE_ID key used by the later merges
rename (case_id) (CASE_ID)

*Keep only FLSA wage and hour violations where back wages were owed.
*Stata treats missing as larger than any number, so each >0 test is paired
*with !missing() to avoid silently keeping rows whose value is missing.
keep if flsa_violtn_cnt>0 & !missing(flsa_violtn_cnt)
keep if flsa_bw_atp_amt>0 & !missing(flsa_bw_atp_amt)

*Restrict to the 50 states + DC by dropping the non-state codes listed here
drop if inlist(st_cd, "GU", "PR", "MP", "VI", "MH", "FM", "AS")

*Merge with KASE data from the FOIA request: This gives LDs.
*keep(1 3) retains every public-data case and discards KASE-only cases.
merge 1:1 CASE_ID using "$whd/kase.dta", gen(_m_kase) keep(1 3)

*Merge with case-level data on hot goods violations from the FOIA request.
*keep(1 3) is applied here as well, so cases that appear only in the hot goods
*file (and therefore never passed the FLSA, back-wage, and state filters
*earlier in this script) are not appended to the working sample.
merge 1:1 CASE_ID using "$whd/hot_goods.dta", gen(_m_hotgoods) keep(1 3)

*Require back wages in the KASE data: drop cases whose assessed back wages are missing or zero.
*This drops 7 cases with positive BW in the public data but zero or missing BW in the KASE data.
drop if missing(AMT_BW_ASSESSED) | AMT_BW_ASSESSED==0

*Liquidated damages per dollar of assessed back wages (a missing LD amount is recoded to zero first)
replace AMT_LD_ASSESSED = 0 if AMT_LD_ASSESSED>=.
generate ld_per_bw = AMT_LD_ASSESSED/AMT_BW_ASSESSED

*Civil monetary penalties per dollar of assessed back wages
generate cmp_per_bw = flsa_cmp_assd_amt/AMT_BW_ASSESSED

*Convert the findings start and end date strings (year-month-day order) to Stata daily dates
foreach edge in start end {
    generate findings_`edge'_date_n = date(findings_`edge'_date, "YMD")
    format %td findings_`edge'_date_n
}

*Calendar year of each findings date
foreach edge in start end {
    generate findings_`edge'_year = year(findings_`edge'_date_n)
}

*Year the case was concluded: integer part of DATE_CONCLUDED divided by 10000
*NOTE(review): presumably DATE_CONCLUDED is a numeric YYYYMMDD value - confirm against the KASE file
generate year_concluded = floor(DATE_CONCLUDED/10000)

*Repeat and/or willful indicator: 1 for any flsa_repeat_violator value other than the literal "N/A"
*NOTE(review): a blank value would also be flagged as 1 - confirm the file codes non-violators as "N/A"
generate tag_rw = !(flsa_repeat_violator=="N/A")

*Period split on year concluded: 1 = 2014 or earlier, 2 = after 2014; left missing when the year is missing
generate period = cond(year_concluded<=2014, 1, 2) if !missing(year_concluded)

*Indicators: LD strictly positive and nonmissing; CMP nonmissing and different from zero
generate ld_nonzero = (AMT_LD_ASSESSED>0 & AMT_LD_ASSESSED<.)
generate cmp_nonzero = !(flsa_cmp_assd_amt==0 | missing(flsa_cmp_assd_amt))

*NAICS prefixes: the first 2, 3, and 4 characters of the naic_cd string
forvalues len = 2/4 {
    generate naics_`len'd = substr(naic_cd, 1, `len')
}

*Save working dataset (the replace option overwrites any existing file at this path)
save "$whd/working.dta", replace
